from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter as rcts1

# Read the Llama 2 PDF from the local docs folder via PyMuPDF.
# NOTE(review): load_and_split() reportedly runs a default text splitter over
# the loaded documents, so entries of `pages` may be chunks rather than whole
# PDF pages -- confirm against the LangChain docs if page granularity matters.
pdfloader = PyMuPDFLoader(file_path="./langchain_docs/llama2.pdf")
pages = pdfloader.load_and_split()
# Debug tip: print pages[0].page_content to inspect the first loaded document.

print("------------------------------------------------------")
# Splitting, variant 1: use the RecursiveCharacterTextSplitter class imported
# from langchain.text_splitter.
# Chunks are measured with len(), capped at 300 with an overlap of 100, and
# add_start_index=True is requested from the splitter.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
)
# Only the first loaded document is split in this demo.
paragraphs = splitter.create_documents([pages[0].page_content])
for paragraph in paragraphs:
    # Print each chunk followed by a separator line.
    print(paragraph.page_content, '==============', sep="\n")

# Splitting, variant 2: use the same-named class imported from the
# langchain_text_splitters package (aliased as rcts1), with the same settings
# as the first variant.
splitter = rcts1(
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
)
# Again, only the first loaded document is split.
paragraphs = splitter.create_documents([pages[0].page_content])
for paragraph in paragraphs:
    # Print each chunk followed by a separator line.
    print(paragraph.page_content, '==============', sep="\n")